#!/usr/bin/env python3

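# pdf2htmlEX embeds images inline in the HTML as <img src="data:..." />
# This script:
#     - removes the src="" (and alt="") attribute of every such image
#     - adds a new bg-N class to the img tag
#     - appends one CSS rule per image, carrying the image data as a
#       background-image, to the <name>.images.css file
#     - overwrites the HTML file with just the #page-container element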

import bleach
import glob
import os
import re
import sys
from pyquery import PyQuery as pq

# Must be given a HTML file (output of pdf2htmlEX)
# sys.exit is used rather than the site-provided exit (), which is intended
# for interactive sessions and is not available when site is not loaded.
# '< 2' also covers an empty argv, so sys.argv[1] below can never fail.
if len (sys.argv) < 2:
    sys.exit ()

# Which item to process
html_file = sys.argv[1]
# The CSS file of the document: the input path with its extension replaced
# by '.images.css'. splitext is used instead of chopping five characters so
# that inputs not ending in '.html' (e.g. '.htm') still give a sane name;
# for '.html' inputs the result is unchanged.
css_file = os.path.splitext (html_file)[0] + '.images.css'

if not os.path.isfile (html_file):
    print ("Input file doesn't exist in the library.")
    sys.exit ()

# Store all the base64 images here, keyed by CSS selector ('.bg-N')
images = {}

# Remove base64 image from <img src="">
def background_to_css (index, element):
    """Move the src of one <img> into the `images` map and tag it with a class.

    index   -- position of the image among the matched images; used to build
               the class name 'bg-<index>'.
    element -- PyQuery object wrapping a single <img> element.

    Side effects: stores the src value in the module-level `images` dict
    under the selector '.bg-<index>', removes the 'alt' and 'src'
    attributes from the element and adds the 'bg-<index>' class to it.

    An element without a src value is left untouched and nothing is
    recorded for it. Otherwise a None would end up in `images` and the CSS
    writer would fail while concatenating it (this happens, for instance,
    when the script is run a second time on an already processed file).
    """
    source = element.attr.src
    if not source:
        return

    new_css_class = 'bg-' + str (index)

    images['.' + new_css_class] = source

    element.remove_attr ('alt')
    element.remove_attr ('src')
    element.add_class (new_css_class)

# Read HTML file
with open (html_file, 'rt', encoding='utf-8') as f:
    dom = pq (f.read ())

# Open sidebar
# NOTE(review): only #page-container is written back below, so this class
# does not reach the output file unless #sidebar lives inside it - confirm
# whether the sidebar is still meant to be exported.
if len (dom ('#outline > ul > li')) > 0:
    dom ('#sidebar').addClass ('opened')

# Loop all images
for index, item in enumerate (dom ('#page-container img.bi')):
    background_to_css (index, dom (item))

# Render both outputs completely BEFORE opening anything for writing.
# Opening the HTML file in 'wt' mode truncates it immediately, so a failure
# after that point would destroy the input; likewise a failure halfway
# through a CSS rule would leave a broken rule in the append-only CSS file.
page_html = dom ('#page-container').outer_html ()
if not page_html:
    # No markup is produced when the selector matched nothing.
    sys.exit ("No #page-container element found in " + html_file + "; nothing was written.")

css_rules = ''.join (
    selector + '{'
    + 'background-image: url(' + data_uri + ');'
    + 'background-position: center;'
    + 'background-repeat: no-repeat;'
    + 'background-size: cover;'
    + '}'
    for selector, data_uri in images.items ()
    # An <img> without src yields no usable data; skip it instead of
    # failing on str + None.
    if data_uri is not None
)

# Overwrite HTML file with removed images
with open (html_file, 'wt', encoding='utf-8') as f:
    f.write (page_html)

# Append new CSS classes (with the images) to the document CSS file
# (the file is opened even when there are no rules, so it always exists
# afterwards, as before)
with open (css_file, 'at', encoding='utf-8') as f:
    f.write (css_rules)





